"""Scrape article metadata from a Jianshu collection into MongoDB, in parallel."""
import requests
from multiprocessing import Pool
from lxml import etree
import pymongo
import time
from my_fake_useragent import UserAgent
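# Rotate the User-Agent header on every request to reduce the chance of
# being blocked by Jianshu's anti-crawler checks.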
ua = UserAgent()
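# MongoDB setup. connect=False defers the actual socket connection until
# first use, so each forked worker process opens its own connection rather
# than sharing one created before the fork (pymongo is not fork-safe).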
client = pymongo.MongoClient('localhost', 27017, connect=False)
mydb = client['jianshu']
jianshu_shouye = mydb['shouye']

def get_info(url):
    """Scrape one listing page and insert each article's metadata into MongoDB."""
    print("Start crawling:", url)
    headers = {'User-Agent': ua.random}
    html = requests.get(url, headers=headers, timeout=10)
    selector = etree.HTML(html.text)
    # Each article on the page is one <li> in the note list
    infos = selector.xpath('//ul[@class="note-list"]/li')
    for info in infos:
        try:
            author = info.xpath('div/div/a[1]/text()')[0]
            title = info.xpath('div/a/text()')[0]
            content = info.xpath('div/p/text()')[0]
            comment = info.xpath('div/div/a[2]/text()')[1].strip()
            like = info.xpath('div/div/span[1]/text()')[0].strip()
            # This field is missing on some posts, so guard the lookup
            forwards = info.xpath('div/div/span[2]/text()')
            forward = forwards[0].strip() if forwards else 'N/A'
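            # Assemble one MongoDB document per article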
            data = {
                'author': author,
                'title': title,
                'content': content,
                'comment': comment,
                'like': like,
                'forward': forward
            }
            jianshu_shouye.insert_one(data)
            print("Inserted one record")
        except IndexError:
            # Some list items lack the expected fields (e.g. ads); skip them
            print("Parse error, item skipped")

if __name__ == '__main__':
    start = time.time()
    urls = ['https://www.jianshu.com/c/bDHhpK?order_by=added_at&page={}'.format(number)
            for number in range(1, 10001)]
    # Map the page URLs over a pool of 5 worker processes; pool.map blocks
    # until every page is done, and the context manager cleans up the pool.
    with Pool(processes=5) as pool:
        pool.map(get_info, urls)
    end = time.time()
    print("Total time:", end - start)